This notebook is an exploratory analysis of the date for the "Predicting Project Success" project.
Dataset from NYC Department of Parks and Recreation public construction projects. The dataset describes all major infrastructure and information technology projects with a budget of $25 million or more that are currently active (in the design, procurement, or construction phase).
Member of the team:
Imports // Helper Functions // Read Dataset
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').show();
} else {
$('div.input').hide();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
import os
import math
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
# BOKEH IMPORTS
from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import HoverTool, ColumnDataSource
output_notebook()
# Plotly imports
from plotly import express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()
import plotly.io as pio
pio.renderers.keys()
pio.renderers.default = 'jupyterlab'
import missingno as msno
# Avoid scientific notation output in Pandas
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.options.display.float_format = '{:,.2f}'.format
import logging
# Improve resolution of output graphcis
%config InlineBackend.figure_format ='retina'
def count_chart(d, count_this, by_this, function='unique'):
"""
Plot counts by column
"""
if function == 'unique':
grouping = d.groupby(by_this).nunique()
elif function == 'count':
grouping = d.groupby(by_this).count()
grouping['left'] = range(0, len(grouping))
grouping['right'] = grouping['left'].values + 1
tools = [HoverTool(
tooltips=[
(f"{by_this}", f"@{by_this}"),
(f"Num {count_this}'s", f"@{count_this}"),]
)
]
p = figure(title=f"Count of Number of {count_this}'s by {by_this}", tools=tools,
plot_height=500, plot_width=700, background_fill_color="#fafafa",
x_range=list(grouping.index))
p.quad(source=ColumnDataSource(grouping[[count_this, 'left', 'right']]),
top=count_this, bottom=0, left='left', right='right',
line_color="white", alpha=0.7)
p.y_range.start = 0
p.xaxis.axis_label = f'{by_this}'
p.yaxis.axis_label = f"Num {count_this}'s"
p.grid.grid_line_color="white"
p.xaxis.major_label_orientation = math.pi/3
show(p)
def hist_chart(d, count_this, for_each):
num = d.groupby(by=for_each).nunique()[count_this]
_left = [x for x in range(1, max(num))]
_right = list(np.array(_left) + 1)
hist, _ = np.histogram(num, bins=len(_left))
tools = [HoverTool(
tooltips=[
(f"This many '{for_each}'", "@top"),
(f"Have this many '{count_this}'", "@left"),
]
)
]
p = figure(title=f"Count of Number of '{count_this}'", tools=tools,
plot_height=300, plot_width=700, background_fill_color="#fafafa")
p.quad(top=hist, bottom=0, left=_left, right=_right, line_color="white", alpha=0.7) #, fill_color="navy", )
p.y_range.start = 0
p.xaxis.axis_label = f"Num '{count_this}'"
p.yaxis.axis_label = f"Num '{for_each}'"
p.grid.grid_line_color="white"
show(p)
def plot_hist_comps(df, metric_1, metric_2, y_log=False, bins=20):
"""Plots side-by-side histograms for comparison with log yscale option
"""
metrics_list = [metric_1, metric_2]
metrics_str = [
metric.replace('_', ' ').upper() for metric in metrics_list
]
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
plt.suptitle(
'Projects by {} and {}'.format(*metrics_str),
fontsize=18
)
for (i, ax), metric_col, metric_name in zip(enumerate(ax), metrics_list, metrics_str):
ax.hist(df[metric_col], bins=bins, alpha=0.7)
ax.axvline(df[metric_col].mean(), color='k', label='mean')
ax.axvline(df[metric_col].quantile(q=.5), color='k', linestyle='--', label='median')
ax.axvline(df[metric_col].quantile(q=.025), color='k', linestyle=':', label='95% range')
ax.axvline(df[metric_col].quantile(q=.975), color='k', linestyle=':')
ax.set_xlabel(metric_name, fontsize=14)
ax.grid(':', alpha=0.4)
if i==0:
ax.set_ylabel('frequency', fontsize=12)
ax.legend(fontsize=12)
if y_log:
ax.set_yscale('log')
if i==0:
ax.set_ylabel('frequency (log scale)', fontsize=12)
plt.tight_layout(rect=[0, 0.03, 1, .94])
plt.show()
def plot_change_trend(trend_data, pid_data, pid):
"""Plots 4 subplots showing project budget and duration forecast change trend
"""
# sets default for converting datetimes in matplotlib
from pandas.plotting import register_matplotlib_converters
from matplotlib.dates import YearLocator, DateFormatter
register_matplotlib_converters()
years = YearLocator()
years_fmt = DateFormatter('%Y')
def set_date_axis(ax, years, years_fmt):
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(years_fmt)
fig, ax = plt.subplots(2,2, sharex=True, figsize=(12,6))
pid_record = pid_data.loc[pid_data['PID']==pid]
pid_changes = trend_data.loc[trend_data['PID']==pid]
project_duration = pid_record['Original_Duration'].values[0] + \
np.cumsum(pid_changes['Latest_Schedule_Changes'].values)
plt.suptitle(
'PID {}: {}\nCategory: {}\nBorough: {}\ninitial duration: {:,.0f} days'.format(
pid,
pid_record['Project_Name'].values[0][:72],
pid_record['Category'].values[0],
pid_record['Borough'].values[0],
pid_record['Original_Duration'].values[0]
), fontsize=16
)
# plot budget forecast
ax[0,0].plot(
pid_changes['Date_Reported_As_Of'], pid_changes['Budget_Forecast']/1e7, 'ko-'
)
ax[0,0].set_title('Total budget forecast')
ax[0,0].set_ylabel('USD (millions)')
# plot budget forecast percent change
ax[1,0].plot(
pid_changes['Date_Reported_As_Of'],
((pid_changes['Latest_Budget_Changes'])/
(pid_changes['Budget_Forecast']-pid_changes['Latest_Budget_Changes']))*100,
'ko-'
)
ax[1,0].axhline(0, color='k', linestyle=':')
ax[1,0].set_title('Percentage budget change')
ax[1,0].set_ylabel('percent change')
ax[1,0].set_xlabel('project change date')
# plot duration forecast
ax[0,1].plot(
pid_changes['Date_Reported_As_Of'], project_duration/1e3, 'ko-'
)
ax[0,1].set_title('Total forecasted project duration')
ax[0,1].set_ylabel('days (thousands)')
# plot duration change
ax[1,1].plot(
pid_changes['Date_Reported_As_Of'],
(pid_changes['Latest_Schedule_Changes'] /
(project_duration - pid_changes['Latest_Schedule_Changes']))*100,
'ko-'
)
ax[1,1].axhline(0, color='k', linestyle=':')
ax[1,1].set_title('Percentage duration change')
ax[1,1].set_ylabel('percent change')
ax[1,1].set_xlabel('project change date')
for a in ax.flat:
a.grid(':', alpha=0.4)
set_date_axis(a, years, years_fmt)
plt.tight_layout(rect=[0, 0.03, 1, .81])
plt.show()
file_path = '../data/Capital_Projects.csv'
if os.path.isfile(file_path):
print("OK - path points to file.")
else:
print("ERROR - check the 'file_path' and ensure it points to the source file.")
data = pd.read_csv(file_path)
# entries
print(f"Number of dataset records: {len(data)}")
# num projects
print(f"Number of unique projects in dataset: {len(data['PID'].unique())}")
# Use 'underscores' in column headers instead of spaces
old_cols = list(data.columns)
new_cols = [s.replace(' ', '_') for s in old_cols]
rename_dict = {k:v for k,v in zip(old_cols, new_cols)}
data.rename(columns=rename_dict, inplace=True)
# Change Date fields to date-type
data['Date_Reported_As_Of'] = pd.to_datetime(data['Date_Reported_As_Of'])
data['Design_Start'] = pd.to_datetime(data['Design_Start'])
data['Forecast_Completion'] = pd.to_datetime(data['Forecast_Completion'])
# make sure data is sorted properly
data = data.sort_values(by=['PID', 'Date_Reported_As_Of'])
To find "weird" partial missing patterns we followed these steps:
weird_missing_pids = []
data_missing = data.drop(columns = ["Latest_Budget_Changes", "Latest_Schedule_Changes"])
for pid, df in data_missing.groupby("PID"):
has_missing_columns = df.isnull().any().any()
if has_missing_columns:
missing_columns = df.columns[df.isnull().any()]
for col in missing_columns:
entire_col_missing = df[col].isnull().all()
if not entire_col_missing:
weird_missing_pids.append(pid)
break
logging.warning(f"Found weird missing pattern for {len(weird_missing_pids)} projects")
## Print all weird missing pids
# for pid, df in data_missing.query("PID in @weird_missing_pids").groupby("PID"):
# print(f"Missing patern for project {pid}")
# msno.matrix(df)
# plt.show()
missing_example_pids = [470, 546, 534, 553, 959]
for pid, df in data_missing.query("PID in @missing_example_pids").groupby("PID"):
print(f"Missing patern for project {pid}")
msno.matrix(df)
plt.show()
OBSERVATION:
There seems to be no patterns to the missing data.
The following cells correct data where errors are evident.
# rename phases to indicate thier sequence
rename_phases = {
'IT':'0-IT',
'Scoping/Planning':'1-Scoping/Planning',
'Design':'2-Design',
'Construction Procurement':'3-Construction Procurement',
'Construction':'4-Construction',
'Close-Out':'5-Close-Out'}
data.Current_Phase = data.Current_Phase.map(rename_phases)
# Create an original budget column
# The first line in any project is the source of the original budget.
# The original budget for the entire project can be calculated from the
# first line of any project:
# Original_Budget = Budget_Forecast - Latest_Budget_Changes
# if 'Latest_Budget_Changes' is NaN, make it 0 (this is the best info we have)
data['Latest_Budget_Changes'] = data['Latest_Budget_Changes'].fillna(0)
# loop through projects - create dict of original_budget values
orig_budgets_dict = {}
for PID in data['PID'].unique():
df = data[data['PID']==PID].sort_values(by='Date_Reported_As_Of')
original_budget = df.iloc[0]['Budget_Forecast']-df.iloc[0]['Latest_Budget_Changes']
orig_budgets_dict.update({PID: original_budget} )
# Several Projects need manual updates because of poor data quality
for k,v in orig_budgets_dict.items():
if math.isnan(v):
print(k, v)
orig_budgets_dict.update({569: 28932000.00})
orig_budgets_dict.update({595: 0})
orig_budgets_dict.update({598: 0})
orig_budgets_dict.update({600: 0})
orig_budgets_dict.update({696: 0})
orig_budgets_dict.update({697: 0})
orig_budgets_dict.update({707: 0})
orig_budgets_dict.update({800: 0})
orig_budgets_dict.update({810: 0})
orig_budgets_dict.update({870: 0})
orig_budgets_dict.update({898: 27500000})
orig_budgets_dict.update({899: 27500000})
orig_budgets_dict.update({900: 27400000})
orig_budgets_dict.update({901: 27500000})
orig_budgets_dict.update({932: 0})
orig_budgets_dict.update({934: 0})
# apply the original budget values to each project
# create columns series and add it to the dataframe
def update_budget(s):
return orig_budgets_dict.get(s.PID)
data.insert(10, 'Original_Budget', data.apply(update_budget, axis=1))
# Check that no values are NaN
data[data['Original_Budget'].isna()]
# Where text descriptions are missing use 'not_specified'
data['Description'] = data['Description'].fillna('not_specified')
data['Borough'] = data['Borough'].fillna('not_specified')
data['Current_Phase'] = data['Current_Phase'].fillna('not_specified')
data['Client_Agency'] = data['Client_Agency'].fillna('not_specified')
# several projects have final lines that have no information - remove them
drop_idxs = data[(data['Current_Phase']=='not_specified') & (data['Design_Start'].isna()) & (data['Latest_Schedule_Changes'].isna()) & (data['Budget_Forecast'].isna()) & (data['Latest_Budget_Changes']==0)].index
data = data.drop(index=drop_idxs)
# NaN values per column
data.isna().sum()
# Fix entries where Total_Budget_Changes is nan
# Can be calculated as the sum of all 'Latest_Budget_Changes'
# loop through projects - create dict of 'Total_Budget_Changes' values
ttl_budget_changes_dict = {}
for PID in data['PID'].unique():
df = data[data['PID']==PID].sort_values(by='Date_Reported_As_Of')
# if no nan's, just take the last ttl value
if len(df['Total_Budget_Changes'].isna()) == 0:
ttl_budget_changes_dict.update({PID: df.iloc[-1]['Total_Budget_Changes']})
else: # there is at least one nan
# if they are all nan - calc as the sum of 'Latest_Budget_Changes'
if data[data['PID']==PID]['Total_Budget_Changes'].isna().all():
ttl_budget_changes_dict.update({PID: df['Latest_Budget_Changes'].sum()})
else: # not all are nan, take the max of 'Total_Budget_Changes'
ttl_budget_changes_dict.update({PID: df['Total_Budget_Changes'].max()})
# apply the total budget changes values to each project
# create columns series and add it to the dataframe
def update_changes(s):
return ttl_budget_changes_dict.get(s.PID)
data['Total_Budget_Changes'] = data.apply(update_changes, axis=1)
# Update/Correct Latest_Schedule_Changes that have nan values
# if the first entry has a nan, make it 0
# loop through projects - create dict of 'Total_Budget_Changes' values
latest_schedule_changes_list = []
new_PID = False
prev_row = None
for i, row in data.iterrows():
# see if this is the first entry for a project
try:
if row.PID != prev_row.PID:
new_PID = True
except:
new_PID = True
x = row['Latest_Schedule_Changes']
# if nan and first entry, make 0
if pd.isna(row['Latest_Schedule_Changes']) and new_PID:
x = 0
# if nan and not first row, calculate
elif pd.isna(row['Latest_Schedule_Changes']) and not new_PID:
# if forecast_completion is nan, we can back into the value
if pd.isna(row['Forecast_Completion']) or pd.isna(prev_row['Forecast_Completion']):
# if the changes are all accounted for, make the entry 0
if row['Total_Schedule_Changes'] - data[data.PID == row.PID]['Latest_Schedule_Changes'].sum() == 0:
x = 0
# see if this is the only NA row, we can back into the value
elif data[data.PID == row.PID]['Latest_Schedule_Changes'].isna().sum() == 1:
x = row['Total_Schedule_Changes'] - data[data.PID == row.PID]['Latest_Schedule_Changes'].sum()
# othewise, can't calculate
else: # there is a forecast_completion date and we can easily calculte the change
x = row['Forecast_Completion'] - prev_row['Forecast_Completion']
latest_schedule_changes_list.append(x)
new_PID = False
prev_row = row.copy()
data['Latest_Schedule_Changes'] = latest_schedule_changes_list
# this project is a lost cause
data[data['Latest_Schedule_Changes'].isna()]
# Now, calculate any missing Forecast_Completion values
fc_completion_list = []
new_PID = False
prev_row = None
for i, row in data.iterrows():
# see if this is the first entry for a project
try:
if row.PID != prev_row.PID:
new_PID = True
except:
new_PID = True
x = row['Forecast_Completion']
if pd.isna(x) and not new_PID:
# we can back into the date
x = prev_row['Forecast_Completion'] + timedelta(int(row['Latest_Schedule_Changes']))
fc_completion_list.append(x)
data['Forecast_Completion'] = fc_completion_list
# now calculate any missing Total_Schedule_Changes
ttl_fc_changes_dict = {}
for PID in data['PID'].unique():
df = data[data['PID']==PID].sort_values(by='Date_Reported_As_Of')
# if no nan's, just take the last ttl value
if len(df['Total_Schedule_Changes'].isna()) == 0:
ttl_fc_changes_dict.update({PID: df.iloc[-1]['Total_Schedule_Changes']})
else: # there is at least one nan
# if they are all nan - calc as the sum of 'Latest_Schedule_Changes'
if data[data['PID']==PID]['Total_Schedule_Changes'].isna().all():
ttl_fc_changes_dict.update({PID: df['Latest_Schedule_Changes'].sum()})
else: # not all are nan, take the max of 'Total_Schedule_Changes'
ttl_fc_changes_dict.update({PID: df['Total_Schedule_Changes'].max()})
def update_changes(s):
return ttl_fc_changes_dict.get(s.PID)
data['Total_Schedule_Changes'] = data.apply(update_changes, axis=1)
# Create an original schedule column
# in the first line of a project, back into the origonal date
# this is the best that we can do
try:
data.drop(columns=['Original_Schedule'], inplace=True)
except:
pass
orig_sched_dict = {}
for PID in data['PID'].unique():
df = data[data['PID']==PID]
x = df.iloc[0]['Forecast_Completion'] - timedelta(days=int(df.iloc[0]['Latest_Schedule_Changes']))
# if the first record couldn't be used, use the last record
if pd.isna(x):
x = df.iloc[-1]['Forecast_Completion']- timedelta(days=int(df.iloc[-1]['Total_Schedule_Changes']))
orig_sched_dict.update({PID: x})
def update_schedule(s):
return orig_sched_dict.get(s.PID)
data.insert(14, 'Original_Schedule', data.apply(update_schedule, axis=1))
data.isna().sum()
# All projects in 'Scoping/Planning' phase have no scheduled date yet. Make themm = 0.
x_idx = data[data['Current_Phase']=='1-Scoping/Planning'].index
for i in x_idx:
data.loc[i] = data.loc[i].fillna(0)
data.isna().sum()
How many updates do we have for each project?
hist_chart(d=data, count_this='Date_Reported_As_Of', for_each='PID')
How many updates do we have for each project Category?
px.histogram(data, x="Date_Reported_As_Of", color ="Category", barmode="group")
px.histogram(data, x="Date_Reported_As_Of", color ="Category", barmode="group", facet_col="Managing_Agency", facet_col_wrap = 4)
Distribution of projects by categories
count_chart(d=data, count_this='PID', by_this='Category')
Distribution of projects by Boroughs
count_chart(d=data, count_this='PID', by_this='Borough')
count_chart(d=data, count_this='PID', by_this='Managing_Agency')
count_chart(d=data, count_this='PID', by_this='Client_Agency')
count_chart(d=data, count_this='PID', by_this='Current_Phase')
# updates per phase
count_chart(d=data, count_this='Date_Reported_As_Of', by_this='Current_Phase', function='count')
OBSERVATION:
It's clear that in many aspects, we have a big problem of imbalanced classes/categories
# identify records with '0' value 'Design_Start', causing datetime conversion errors
drop_PID = list(data.loc[data['Design_Start']==0]['PID'].values)
n_drop_PID_records = len(data.loc[data['Design_Start']==0]['PID'])
n_records = len(data)
# print summary of records
print(
'The following projects have zero-valued "Design_Start" dates and '\
'will be dropped, resulting in {} total records dropped:\n\n\t{}\n'\
''.format(n_drop_PID_records, drop_PID)
)
# drop identified PID records
data = data.copy().loc[~data['PID'].isin(drop_PID)]
print(
'The resulting dataframe, now contains {:,} records'.format(len(data))
)
# re-establish datetime columns as correct type
date_cols = [
'Date_Reported_As_Of',
'Forecast_Completion',
'Design_Start',
'Original_Schedule'
]
for col in date_cols:
data[col] = pd.to_datetime(data[col])
# print summary stats for datetime cols
data[date_cols].describe()
OBSERVATION:
Note the transposed year value (i.e. $2108$) present in the Forecast_Completion values. Before continuing with this trend analysis, we will assume that this is the result of a keying error and will fix this value by setting it to $2018$.
# fix invalid year "2108", assuming it was a keying error
data.loc[data['Forecast_Completion']=='2108-10-21', 'Forecast_Completion'] = pd.Timestamp('2018-10-21')
# print summary stats for datetime cols once more
data[date_cols].describe()
# verify resuling data types
data.info()
# convert dataframe single record for each PID
df_pid = data.groupby('PID').agg(
{
'Project_Name': 'first',
'Category': 'first',
'Borough': 'first',
'Managing_Agency': 'first',
'Client_Agency': 'first',
'Date_Reported_As_Of': 'count',
'Original_Budget': 'first',
'Design_Start': min,
'Original_Schedule': 'first',
'Forecast_Completion': 'last',
'Total_Schedule_Changes': max,
'Total_Budget_Changes': max
}
).reset_index()
# rename column for number of changes
df_pid = df_pid.rename(columns={'Date_Reported_As_Of': 'Number_Changes'})
# define original forecasted project duration
df_pid['Original_Duration'] = (df_pid['Original_Schedule'] - df_pid['Design_Start']).dt.days
# define budget change ratio
df_pid['Budget_Change_Ratio'] = df_pid['Total_Budget_Changes']/df_pid['Original_Budget']
# define schedule change ratio
df_pid['Schedule_Change_Ratio'] = df_pid['Total_Schedule_Changes']/df_pid['Original_Duration']
# Drop projects with missing or zero values for each of the following columns
drop_filter_cols = [
'Original_Budget',
'Design_Start',
'Original_Schedule'
]
# define df.loc filter to simplify list comprehension and drop functions
loc_filter = lambda df, col: (df[col]==0) | (df[col].isnull())
# store list of tuples identify the PID and corresponding category of each to be dropped
drop_pids = sum(
[
list(
zip(
[*df_pid.loc[loc_filter(df_pid, col)]['PID'].values],
[*df_pid.loc[loc_filter(df_pid, col)]['Category'].values]
)
) for col in drop_filter_cols
], []
)
# print summary of PIDs to be dropped
print(
'The following PIDs will be dropped because they have missing or '\
'zero values for the columns:\n\n\t{}\n\nPID\tCategory'.format(drop_filter_cols)
)
for pid in drop_pids:
print('{}\t{}'.format(*pid))
# drop PIDs from dataframe
for col in drop_filter_cols:
df_pid = df_pid.copy().loc[~loc_filter(df_pid, col)]
# print summary of remaining PIDs
print(
'\nThe resulting dataframe contains {} unique projects\n'.format(len(df_pid))
)
OBSERVATION:
IT projects don't seem to follow the patterns/rules of other projects
%%time
# generate scatter matrix with each quantitative metric
scatter_cols = [
'Number_Changes',
'Original_Budget',
'Total_Budget_Changes',
'Budget_Change_Ratio',
'Original_Duration',
'Total_Schedule_Changes',
'Schedule_Change_Ratio'
]
g = sns.pairplot(df_pid[scatter_cols], plot_kws={'alpha': 0.3, 's': 100})
g.fig.suptitle('Distribution of budget and duration changes by project', fontsize=22, y=1.015)
plt.show();
OBSERVATIONS:
The above scatter matrix demonstrates, on a project-by-project basis, the correlation between quantitative features associated with each of the 355 unique projects that remain in the dataset. Of greatest interest here are relationships between the various "project change"-related measurements associated with each project. Of additional concern is the relative scale (in terms of budget and schedule duration) of the various projects measured.
In the subplots above, while there appears to be slight linear relationships between several features -- most notably the relationships between total schedule change (in days) and schedule change ratio, as well as number of project changes versus schedule-related metrics -- many of these relationships have fairly extreme outlying projects, which may obscure more nuanced relationships by compressing the majority of data points along either the $x$ or $y$ axes. The various relationships plotted against the budget and budget change features clearly exhibit this outlier behavior.
While the removal of these outliers and closer examination of non-outlying datapoints does warrant additional investigation, the remainder of this "change trend" analysis will instead focus on projects with extreme budget and schedule characteristics.
To do this, we will next look a little more closely at the distributions of budget- and schedule-related measures by project to identify some of the projects with these "extreme" measures.
plot_hist_comps(df_pid, 'Original_Budget', 'Original_Duration', y_log=True)
plot_hist_comps(df_pid, 'Total_Budget_Changes', 'Total_Schedule_Changes', y_log=True)
plot_hist_comps(df_pid, 'Budget_Change_Ratio', 'Schedule_Change_Ratio', y_log=True)
OBSERVATIONS:
As can be seen in the paired histogram plots above, most features exhibit distinct right-skewed distributions among projects. Most notably, among the "original" budget and duration as well as the "change ratios" for both budget and duration, clear outliers exist at extremely high values (these ratios are defined as the "total changes" divided by the "original" values for each project).
While a more in-depth statistical analysis will be needed to explore the aggregate change trends among all projects measured, for the purpose of this initial EDA, we will just visually inspect the time-series changes that have taken place among some of these more "extreme" projects below.
def identify_edge_pids(df, metric_col, n_pids=2,
change_threshold=5, change_col='Number_Changes'):
"""Identifies PIDs with minimum and maximum values for any given metric
"""
loc_filter = df[change_col]>=change_threshold
max_pids = list(
df.loc[loc_filter].sort_values(by=metric_col, ascending=True)[-n_pids:]['PID']
)
min_pids = list(
df.loc[loc_filter].sort_values(by=metric_col, ascending=True)[:n_pids]['PID']
)
return {'min': min_pids, 'max': max_pids}
metric_col_list = [
'Original_Budget',
'Original_Duration',
'Budget_Change_Ratio',
'Schedule_Change_Ratio',
]
change_threshold = 5
edge_pids_dict = {
metric_col: identify_edge_pids(
df_pid, metric_col, n_pids=2, change_threshold=change_threshold
)
for metric_col in metric_col_list
}
print(
'Projects with {} or more change records, and the largest and '\
'smallest metric values are as follows:\n'.format(change_threshold)
)
for key, value in edge_pids_dict.items():
print('{}\n'.format(key))
for edge, pid_list in value.items():
print('{}:\t{}'.format(edge, pid_list))
print()
plot_change_trend(data, df_pid, 603)
plot_change_trend(data, df_pid, 480)
The two projects plotted above represent:
OBSERVATIONS:
In the case of both these projects, extremely large re-forecasts in budget and schedule were made during the life of the projects. Conversely, both started with extremely small forecasts relative to other projects in this dataset. In both these cases, we are led to question the validity of the original scopes for both projects. Either that, or perhaps significant additional scope was added to each project after the original design was completed.
plot_change_trend(data, df_pid, 555)
The project above represents:
OBSERVATIONS:
It the above project change timeline, it is interesting to note the overall reduction in budget forecast over time, as well as the erratic short-term reduction in forecasted duration. Additional information on a project such as this would be of interest to understand the temporary reduction in hours by more that 1,500 days.
plot_change_trend(data, df_pid, 96)
The project above represents:
OBSERVATIONS:
Althought this project began as the project with the longest original scheduled duration, at more than 8,800 days, it is intersting to note that this scope has still increased by an additional 1,800 days.
plot_change_trend(data, df_pid, 537)
plot_change_trend(data, df_pid, 482)
The project above represents:
PID 537: The smallest Budget Change Ratio
PID 482: The smallest Schedule Change Ratio
OBSERVATIONS:
Both of the above plots exhibit fairly extreme reductions in scope over the life of the project. PID 537 decreased in budget forecast from more than \$30 million to less than \\$5 million with no corresponding reduction in scheduled duration. The second, PID 482, exhibited a reduction in scheduled duration from more than 3,000 days, to fewer than 2,000 days, but only temporarily exhibited a reduction in forecasted budget.
CONCERNS:
These two projects, as well as the others examined above, leads us to wonder the validity of some of the measures reported in this data, as well as the confounding factors that may also be influencing some of these changes in scope and forecast.
Metrics calculated:
df_pid = df_pid.eval('Budget_To_Date = Original_Budget + Total_Budget_Changes')\
.eval('Budget_Abs_Per_Error = abs(Original_Budget - Budget_To_Date)/Budget_To_Date')\
.eval('Budget_Rel_Per_Error = abs(Original_Budget - Budget_To_Date)/Original_Budget')\
.eval('Mark_Metric = (Original_Budget/Budget_To_Date) - 1')\
.eval('Budget_Ratios = Budget_To_Date/Original_Budget')
df_pid
df_pid_melted = df_pid.melt(id_vars=set(df_pid.columns) - set(["Budget_Abs_Per_Error", "Budget_Rel_Per_Error", 'Budget_Ratios', "Mark_Metric"]), value_vars=["Budget_Abs_Per_Error", "Budget_Rel_Per_Error", 'Budget_Ratios', "Mark_Metric"], var_name="metric", value_name="metric_value")
df_pid_melted
px.histogram(df_pid_melted,x="metric_value", facet_col="metric", color="Category", barmode="overlay" )
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", color= "Category", line_dash="PID", hover_name="PID", height=1000)
fig.update_xaxes(
rangeslider_visible=True,
rangeselector=dict(
buttons=list([
dict(count=1, label="1m", step="month", stepmode="backward"),
dict(count=6, label="6m", step="month", stepmode="backward"),
dict(count=1, label="YTD", step="year", stepmode="todate"),
dict(count=1, label="1y", step="year", stepmode="backward"),
dict(step="all")
])
)
)
fig
OBSERVATION
There's a few projects that have enormous changes and skew the plot, while others are more clustered
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", facet_col = "Borough", color= "Category", hover_name="PID", line_group="PID", facet_col_wrap=5, height= 2000)
fig
OBSERVATION:
Borough specified, so it's unlikely we could use it as predictor for budget/schedule changesBridges)Poor data quality. The trend analyses for a handful of projects led us to wonder if the validity of some of the measures reported in this data, as well as the confounding factors that may also be influencing some of these changes in scope and forecast.
Data is inconsistent in quality. Human-entered data will always have inconsistencies. We will need to work around or with that.
Any model will need more data. We will need to source more project data from other regions and types of projects to increase the variety
Lots of missing data. Any analyses would have to dive into the various biases that removing the missing data entails
To feed a model, some data engineering will be necessary. The types of dat that need to be created are not yet understood.
Any efforts to normalize data will need to consider normalization within the project as well as within the entire dataset.
Given everything you have learned, if you faced this data set in the wild, how would you proceed?
Need to do some investigations on some of the projects and their budget/schedule deviations. Simply removing the projects with seemingly erroneous or inconsistent data could lead to a biased dataset.
Find secondary sources of data on the projects/agencies and management practices, how data is collected and incentive systems.
Collect information on how the project management methodology in the specific reporting agencies.
What are the important measures?
What are the right questions to ask, and how can the data answer them?
Are there any quantitative indicators at the beginning of the project that would help us predict the risk of cost and time overrun?
Are there indicators during the life cycle of a project that would help us predict the risk of cost and time overrun?
The data we have right now is insufficient to answer these questions. All of the projects we currently have data for have not finished.
Additional data that we need would be: